# Import the appropriate libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.graph_objs as go#visualization
import plotly.offline as py#visualization
import seaborn as sns
import plotly.express as plots
%matplotlib inline
# Load the 'Store Master' sheet: one row per store with revenue, region,
# size bucket, and penetration metrics (28 columns).
Sales_df = pd.read_excel('Sales Assessment.xlsx', sheet_name = 'Store Master')
Sales_df.shape
(2586, 28)
# Inspect dtypes and non-null counts to locate columns with missing values.
Sales_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2586 entries, 0 to 2585 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2586 non-null int64 1 STAR Dealer ? 2586 non-null int64 2 Revenue Total 2586 non-null int64 3 Region 2586 non-null object 4 Territory 2586 non-null object 5 Size 2560 non-null object 6 PI 2586 non-null float64 7 EligMaj 2475 non-null float64 8 EligConv 1274 non-null float64 9 2021 Customer Base 2568 non-null float64 10 2021 Products (Qty) 2586 non-null int64 11 2021 Revenue ($) 2586 non-null int64 12 2021 PEN 2550 non-null float64 13 2021 $PEN 2550 non-null float64 14 2020 Revenue ($) 2444 non-null float64 15 2020 Contracts ($) 2444 non-null float64 16 2020 Customer Base 2464 non-null float64 17 2020 PEN 2428 non-null float64 18 2020 $PEN 2428 non-null float64 19 Segmentation 2586 non-null object 20 City 2396 non-null object 21 State 2396 non-null object 22 Market type 2396 non-null object 23 10 Year VIOs 2396 non-null float64 24 Service Absorption % 2396 non-null object 25 Trade Zone Retention 2396 non-null float64 26 Tech Counts 2396 non-null float64 27 Stall Counts 2396 non-null float64 dtypes: float64(15), int64(5), object(8) memory usage: 565.8+ KB
# Preview the first rows of the store master data.
Sales_df.head()
| Store id | STAR Dealer ? | Revenue Total | Region | Territory | Size | PI | EligMaj | EligConv | 2021 Customer Base | ... | 2020 $PEN | Segmentation | City | State | Market type | 10 Year VIOs | Service Absorption % | Trade Zone Retention | Tech Counts | Stall Counts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 820218202 | 0 | 164945 | Northeast | NE06 | NaN | 2.95 | 101.0 | 197.0 | NaN | ... | NaN | F2F | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 100261002 | 0 | 637720 | Midwest | MW03 | NaN | 1.96 | 414.0 | 396.0 | NaN | ... | NaN | F2F | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 150261502 | 0 | 901600 | Mid Atlantic | MA04 | NaN | 2.06 | 739.0 | 786.0 | NaN | ... | NaN | F2F | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 180261802 | 0 | 724205 | Mid Atlantic | MA04 | NaN | 3.11 | 476.0 | 1004.0 | NaN | ... | NaN | F2F | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 210262102 | 0 | 296845 | Southwest | SW03 | NaN | 3.92 | 128.0 | 374.0 | NaN | ... | NaN | F2F | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 28 columns
# Impute missing values column by column: numeric (int64/float64) columns
# get 0, all other columns get the sentinel string 'Unknown'.
for col in Sales_df.columns:
    col_dtype = Sales_df.dtypes[col]
    if col_dtype == 'int64' or col_dtype == 'float64':
        Sales_df[col] = Sales_df[col].fillna(0)
    else:
        Sales_df[col] = Sales_df[col].fillna('Unknown')
Sales_df.head()
| Store id | STAR Dealer ? | Revenue Total | Region | Territory | Size | PI | EligMaj | EligConv | 2021 Customer Base | ... | 2020 $PEN | Segmentation | City | State | Market type | 10 Year VIOs | Service Absorption % | Trade Zone Retention | Tech Counts | Stall Counts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 820218202 | 0 | 164945 | Northeast | NE06 | Unknown | 2.95 | 101.0 | 197.0 | 0.0 | ... | 0.0 | F2F | Unknown | Unknown | Unknown | 0.0 | Unknown | 0.0 | 0.0 | 0.0 |
| 1 | 100261002 | 0 | 637720 | Midwest | MW03 | Unknown | 1.96 | 414.0 | 396.0 | 0.0 | ... | 0.0 | F2F | Unknown | Unknown | Unknown | 0.0 | Unknown | 0.0 | 0.0 | 0.0 |
| 2 | 150261502 | 0 | 901600 | Mid Atlantic | MA04 | Unknown | 2.06 | 739.0 | 786.0 | 0.0 | ... | 0.0 | F2F | Unknown | Unknown | Unknown | 0.0 | Unknown | 0.0 | 0.0 | 0.0 |
| 3 | 180261802 | 0 | 724205 | Mid Atlantic | MA04 | Unknown | 3.11 | 476.0 | 1004.0 | 0.0 | ... | 0.0 | F2F | Unknown | Unknown | Unknown | 0.0 | Unknown | 0.0 | 0.0 | 0.0 |
| 4 | 210262102 | 0 | 296845 | Southwest | SW03 | Unknown | 3.92 | 128.0 | 374.0 | 0.0 | ... | 0.0 | F2F | Unknown | Unknown | Unknown | 0.0 | Unknown | 0.0 | 0.0 | 0.0 |
5 rows × 28 columns
# Re-check: all 28 columns should now report 2586 non-null entries.
Sales_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2586 entries, 0 to 2585 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2586 non-null int64 1 STAR Dealer ? 2586 non-null int64 2 Revenue Total 2586 non-null int64 3 Region 2586 non-null object 4 Territory 2586 non-null object 5 Size 2586 non-null object 6 PI 2586 non-null float64 7 EligMaj 2586 non-null float64 8 EligConv 2586 non-null float64 9 2021 Customer Base 2586 non-null float64 10 2021 Products (Qty) 2586 non-null int64 11 2021 Revenue ($) 2586 non-null int64 12 2021 PEN 2586 non-null float64 13 2021 $PEN 2586 non-null float64 14 2020 Revenue ($) 2586 non-null float64 15 2020 Contracts ($) 2586 non-null float64 16 2020 Customer Base 2586 non-null float64 17 2020 PEN 2586 non-null float64 18 2020 $PEN 2586 non-null float64 19 Segmentation 2586 non-null object 20 City 2586 non-null object 21 State 2586 non-null object 22 Market type 2586 non-null object 23 10 Year VIOs 2586 non-null float64 24 Service Absorption % 2586 non-null object 25 Trade Zone Retention 2586 non-null float64 26 Tech Counts 2586 non-null float64 27 Stall Counts 2586 non-null float64 dtypes: float64(15), int64(5), object(8) memory usage: 565.8+ KB
# Load per-store product revenue ($) by product type.
products_sold_df = pd.read_excel('Sales Assessment.xlsx', sheet_name = 'Products Sold by Type ($)')
products_sold_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2589 entries, 0 to 2588 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2589 non-null object 1 Auto Appearance Care 222 non-null float64 2 CPOV Base Warranties 1531 non-null float64 3 Gap 735 non-null float64 4 Lease Wear & Tear 637 non-null float64 5 Limited Warranties 378 non-null float64 6 LOF 2208 non-null float64 7 Multicare 237 non-null float64 8 New Majors Stated Time 2018 non-null float64 9 New Majors Unlimited 914 non-null float64 10 Owner Care 844 non-null float64 11 Short Term Lease 372 non-null float64 12 Tire & Wheel 557 non-null float64 13 Used Majors Stated Time 1772 non-null float64 14 Used Majors Unlimited 648 non-null float64 15 (blank) 0 non-null float64 16 Grand Total 2588 non-null float64 dtypes: float64(16), object(1) memory usage: 344.0+ KB
# Preview the product-revenue sheet.
products_sold_df.head()
| Store id | Auto Appearance Care | CPOV Base Warranties | Gap | Lease Wear & Tear | Limited Warranties | LOF | Multicare | New Majors Stated Time | New Majors Unlimited | Owner Care | Short Term Lease | Tire & Wheel | Used Majors Stated Time | Used Majors Unlimited | (blank) | Grand Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 820218202 | 15065.0 | NaN | NaN | 15600.0 | NaN | NaN | NaN | 102455.0 | NaN | 9320.0 | 1145.0 | 20210.0 | 1150.0 | NaN | NaN | 164945.0 |
| 1 | 100261002 | NaN | NaN | 134400.0 | 1950.0 | NaN | NaN | NaN | 288680.0 | NaN | NaN | 1015.0 | 1840.0 | 209835.0 | NaN | NaN | 637720.0 |
| 2 | 150261502 | NaN | NaN | 248950.0 | 1875.0 | 1440.0 | 8790.0 | 2860.0 | 476190.0 | NaN | 26490.0 | NaN | 20135.0 | 113125.0 | 1745.0 | NaN | 901600.0 |
| 3 | 180261802 | 510.0 | NaN | 218290.0 | 4875.0 | 11690.0 | NaN | 6555.0 | 309685.0 | NaN | 2030.0 | NaN | 86595.0 | 83110.0 | 865.0 | NaN | 724205.0 |
| 4 | 210262102 | 2040.0 | NaN | 114650.0 | NaN | NaN | 5535.0 | NaN | 22955.0 | NaN | 305.0 | NaN | 8165.0 | 138650.0 | 4545.0 | NaN | 296845.0 |
# The '(blank)' column has 0 non-null values (see .info() above), so drop it
# directly — the original renamed it to 'blank' first, which was unnecessary.
products_sold_df.drop(columns='(blank)', inplace=True)
products_sold_df.head()
| Store id | Auto Appearance Care | CPOV Base Warranties | Gap | Lease Wear & Tear | Limited Warranties | LOF | Multicare | New Majors Stated Time | New Majors Unlimited | Owner Care | Short Term Lease | Tire & Wheel | Used Majors Stated Time | Used Majors Unlimited | Grand Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 820218202 | 15065.0 | NaN | NaN | 15600.0 | NaN | NaN | NaN | 102455.0 | NaN | 9320.0 | 1145.0 | 20210.0 | 1150.0 | NaN | 164945.0 |
| 1 | 100261002 | NaN | NaN | 134400.0 | 1950.0 | NaN | NaN | NaN | 288680.0 | NaN | NaN | 1015.0 | 1840.0 | 209835.0 | NaN | 637720.0 |
| 2 | 150261502 | NaN | NaN | 248950.0 | 1875.0 | 1440.0 | 8790.0 | 2860.0 | 476190.0 | NaN | 26490.0 | NaN | 20135.0 | 113125.0 | 1745.0 | 901600.0 |
| 3 | 180261802 | 510.0 | NaN | 218290.0 | 4875.0 | 11690.0 | NaN | 6555.0 | 309685.0 | NaN | 2030.0 | NaN | 86595.0 | 83110.0 | 865.0 | 724205.0 |
| 4 | 210262102 | 2040.0 | NaN | 114650.0 | NaN | NaN | 5535.0 | NaN | 22955.0 | NaN | 305.0 | NaN | 8165.0 | 138650.0 | 4545.0 | 296845.0 |
# Load per-store product quantities by product type.
quantity_data = pd.read_excel('Sales Assessment.xlsx', sheet_name = 'Products Sold by Type (Qty)')
quantity_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2589 entries, 0 to 2588 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2589 non-null object 1 Grand Total 2588 non-null float64 2 Auto Appearance Care 222 non-null float64 3 CPOV Base Warranties 1531 non-null float64 4 Gap 735 non-null float64 5 Lease Wear & Tear 637 non-null float64 6 Limited Warranties 378 non-null float64 7 LOF 2208 non-null float64 8 Multicare 237 non-null float64 9 New Majors Stated Time 2018 non-null float64 10 New Majors Unlimited 914 non-null float64 11 Owner Care 844 non-null float64 12 Short Term Lease 372 non-null float64 13 Tire & Wheel 557 non-null float64 14 Used Majors Stated Time 1772 non-null float64 15 Used Majors Unlimited 648 non-null float64 16 (blank) 0 non-null float64 dtypes: float64(16), object(1) memory usage: 344.0+ KB
# Preview the product-quantity sheet.
quantity_data.head()
| Store id | Grand Total | Auto Appearance Care | CPOV Base Warranties | Gap | Lease Wear & Tear | Limited Warranties | LOF | Multicare | New Majors Stated Time | New Majors Unlimited | Owner Care | Short Term Lease | Tire & Wheel | Used Majors Stated Time | Used Majors Unlimited | (blank) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 820218202 | 324.0 | 50.0 | NaN | NaN | 52.0 | NaN | NaN | NaN | 98.0 | NaN | 29.0 | 4.0 | 90.0 | 1.0 | NaN | NaN |
| 1 | 100261002 | 810.0 | NaN | NaN | 378.0 | 6.0 | NaN | NaN | NaN | 239.0 | NaN | NaN | 3.0 | 9.0 | 175.0 | NaN | NaN |
| 2 | 150261502 | 1588.0 | NaN | NaN | 693.0 | 5.0 | 12.0 | 16.0 | 7.0 | 622.0 | NaN | 35.0 | NaN | 81.0 | 115.0 | 2.0 | NaN |
| 3 | 180261802 | 1545.0 | 2.0 | NaN | 643.0 | 13.0 | 63.0 | NaN | 13.0 | 406.0 | NaN | 2.0 | NaN | 333.0 | 69.0 | 1.0 | NaN |
| 4 | 210262102 | 521.0 | 9.0 | NaN | 326.0 | NaN | NaN | 18.0 | NaN | 19.0 | NaN | 1.0 | NaN | 39.0 | 107.0 | 2.0 | NaN |
# Zero-fill missing quantities (fillna is the idiomatic form of
# replace(np.nan, 0)) and drop the all-empty '(blank)' column directly
# instead of renaming it first.
quantity_data = quantity_data.fillna(0)
quantity_data.drop(columns='(blank)', inplace=True)
quantity_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2589 entries, 0 to 2588 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2589 non-null object 1 Grand Total 2589 non-null float64 2 Auto Appearance Care 2589 non-null float64 3 CPOV Base Warranties 2589 non-null float64 4 Gap 2589 non-null float64 5 Lease Wear & Tear 2589 non-null float64 6 Limited Warranties 2589 non-null float64 7 LOF 2589 non-null float64 8 Multicare 2589 non-null float64 9 New Majors Stated Time 2589 non-null float64 10 New Majors Unlimited 2589 non-null float64 11 Owner Care 2589 non-null float64 12 Short Term Lease 2589 non-null float64 13 Tire & Wheel 2589 non-null float64 14 Used Majors Stated Time 2589 non-null float64 15 Used Majors Unlimited 2589 non-null float64 dtypes: float64(15), object(1) memory usage: 323.8+ KB
# Join the price and quantity sheets on store id; shared product columns
# receive '_price' / '_qty' suffixes.
df_qty_price = pd.merge(products_sold_df,quantity_data,how='left',on='Store id',suffixes=('_price', '_qty'))
# 'Store id' is int64 in the master sheet but object in the product sheets —
# cast to str so the merge keys match.
Sales_df['Store id'] = Sales_df['Store id'].astype(str)
df_store_qty_price = pd.merge(Sales_df,df_qty_price,how='left',on='Store id')
df_store_qty_price.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2586 entries, 0 to 2585 Data columns (total 58 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2586 non-null object 1 STAR Dealer ? 2586 non-null int64 2 Revenue Total 2586 non-null int64 3 Region 2586 non-null object 4 Territory 2586 non-null object 5 Size 2586 non-null object 6 PI 2586 non-null float64 7 EligMaj 2586 non-null float64 8 EligConv 2586 non-null float64 9 2021 Customer Base 2586 non-null float64 10 2021 Products (Qty) 2586 non-null int64 11 2021 Revenue ($) 2586 non-null int64 12 2021 PEN 2586 non-null float64 13 2021 $PEN 2586 non-null float64 14 2020 Revenue ($) 2586 non-null float64 15 2020 Contracts ($) 2586 non-null float64 16 2020 Customer Base 2586 non-null float64 17 2020 PEN 2586 non-null float64 18 2020 $PEN 2586 non-null float64 19 Segmentation 2586 non-null object 20 City 2586 non-null object 21 State 2586 non-null object 22 Market type 2586 non-null object 23 10 Year VIOs 2586 non-null float64 24 Service Absorption % 2586 non-null object 25 Trade Zone Retention 2586 non-null float64 26 Tech Counts 2586 non-null float64 27 Stall Counts 2586 non-null float64 28 Auto Appearance Care_price 197 non-null float64 29 CPOV Base Warranties_price 1360 non-null float64 30 Gap_price 662 non-null float64 31 Lease Wear & Tear_price 570 non-null float64 32 Limited Warranties_price 339 non-null float64 33 LOF_price 1979 non-null float64 34 Multicare_price 211 non-null float64 35 New Majors Stated Time_price 1812 non-null float64 36 New Majors Unlimited_price 821 non-null float64 37 Owner Care_price 756 non-null float64 38 Short Term Lease_price 333 non-null float64 39 Tire & Wheel_price 503 non-null float64 40 Used Majors Stated Time_price 1590 non-null float64 41 Used Majors Unlimited_price 583 non-null float64 42 Grand Total_price 2313 non-null float64 43 Grand Total_qty 2312 non-null float64 44 Auto Appearance Care_qty 2312 non-null float64 45 CPOV Base Warranties_qty 
2312 non-null float64 46 Gap_qty 2312 non-null float64 47 Lease Wear & Tear_qty 2312 non-null float64 48 Limited Warranties_qty 2312 non-null float64 49 LOF_qty 2312 non-null float64 50 Multicare_qty 2312 non-null float64 51 New Majors Stated Time_qty 2312 non-null float64 52 New Majors Unlimited_qty 2312 non-null float64 53 Owner Care_qty 2312 non-null float64 54 Short Term Lease_qty 2312 non-null float64 55 Tire & Wheel_qty 2312 non-null float64 56 Used Majors Stated Time_qty 2312 non-null float64 57 Used Majors Unlimited_qty 2312 non-null float64 dtypes: float64(45), int64(4), object(9) memory usage: 1.2+ MB
# Stores flagged as STAR dealers (non-zero 'STAR Dealer ?' flag).
star_df = df_store_qty_price[df_store_qty_price['STAR Dealer ?'] != 0]
# Box plot of their total annual revenue.
fig = plots.box(star_df, y="Revenue Total",
                title='Annual revenue based on Star dealers')
fig = fig.update_traces(marker=dict(color='Purple'))
fig.show()
# Stores NOT enrolled as STAR dealers (flag == 0).
nonstar_df = df_store_qty_price[df_store_qty_price['STAR Dealer ?'] == 0]
# Box plot of their total annual revenue, for comparison with the STAR group.
fig = plots.box(nonstar_df, y="Revenue Total",
                title='Annual revenue based on Non-Star dealers')
fig = fig.update_traces(marker=dict(color='Green'))
fig.show()
From the two box plots above, we find that STAR dealers made higher revenue compared to non-STAR dealers, and a few stores with unusually high revenue appear as outliers.
# Box plot of revenue for size-'A' stores (smallest customer-base bucket).
Size_A_df = df_store_qty_price[df_store_qty_price['Size'] =='A']
fig = plots.box(Size_A_df , y="Revenue Total", title='Annual revenue based on small amount of customer size(A)').update_traces(marker=dict(color='Green'))
fig.show()
# Box plot of revenue for size-'E' stores (largest customer-base bucket).
Size_E_df = df_store_qty_price[df_store_qty_price['Size'] =='E']
fig = plots.box(Size_E_df , y="Revenue Total", title='Annual revenue based on large amount of customer size(E)').update_traces(marker=dict(color='Green'))
fig.show()
The box plots for customer sizes A and E show that stores with a larger customer base generated more revenue.
# Restrict both dealer groups to stores with < 2000 customers so the
# histogram x-range stays readable (drops a long tail of large stores).
star_cus = star_df[star_df['2021 Customer Base'] < 2000]
nonstar_cus = nonstar_df[nonstar_df["2021 Customer Base"] <2000]
# Overlaid percentage histograms: STAR vs non-STAR customer-base distribution.
trace1 = go.Histogram(x = star_cus["2021 Customer Base"],histnorm= "percent",name = "Star Dealers",marker = dict(line = dict(width = .5,color = "red")),
opacity = .9)
trace2 = go.Histogram(x = nonstar_cus["2021 Customer Base"], histnorm = "percent",name = "Non Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
data_1 = [trace1,trace2]
layout = go.Layout(dict(title ="2021 Customer Base" + " distribution as per dealer type ", plot_bgcolor = "rgb(243,243,243)",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
title = "2021 Customer Base",
zerolinewidth=1,
ticklen=5,
gridwidth=2
),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "Percent of stores",zerolinewidth=1,ticklen=5, gridwidth=2),))
fig = go.Figure(data = data_1,layout=layout)
py.iplot(fig)
There are several stores that have under 1000 customers.
# Same comparison for 2021 product quantities, capped at < 3000 units
# to keep the distribution's bulk visible.
star_qty = star_df[star_df['2021 Products (Qty)'] < 3000]
nonstar_qty = nonstar_df[nonstar_df['2021 Products (Qty)'] < 3000]
trace1 = go.Histogram(x = star_qty['2021 Products (Qty)'],histnorm= "percent",name = "Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
trace2 = go.Histogram(x = nonstar_qty['2021 Products (Qty)'], histnorm = "percent",name = "Non Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
data_1 = [trace1,trace2]
layout = go.Layout(dict(title ='2021 Products (Qty)' + " distribution ",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = '2021 Products (Qty)',),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "Percent of stores",zerolinewidth=1,ticklen=5, gridwidth=2),))
fig = go.Figure(data = data_1,layout=layout)
py.iplot(fig)
A few product-quantity ranges account for the highest percentage of stores.
# Percentage histograms of 2021 revenue for STAR vs non-STAR dealers
# (uncapped — full revenue range).
trace1 = go.Histogram(x = star_df['2021 Revenue ($)'],histnorm= "percent",name = "Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
trace2 = go.Histogram(x = nonstar_df['2021 Revenue ($)'], histnorm = "percent",name = "Non Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
data_1 = [trace1,trace2]
layout = go.Layout(dict(title ='2021 Revenue ($)' + " distribution ",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = '2021 Revenue ($)',),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "Percent of stores",zerolinewidth=1,ticklen=5, gridwidth=2),))
fig = go.Figure(data = data_1,layout=layout)
py.iplot(fig)
A large number of stores made revenue under 0.5 million in the year 2021.
# Same revenue-distribution comparison for the prior year (2020).
trace1 = go.Histogram(x = star_df['2020 Revenue ($)'],histnorm= "percent",name = "Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
trace2 = go.Histogram(x = nonstar_df['2020 Revenue ($)'], histnorm = "percent",name = "Non Star Dealers",marker = dict(line = dict(width = .5,color = "black")),
opacity = .9)
data_1 = [trace1,trace2]
layout = go.Layout(dict(title ='2020 Revenue ($)' + " distribution ",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)', title = '2020 Revenue ($)',),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = "Percent of stores",zerolinewidth=1,ticklen=5, gridwidth=2),))
fig = go.Figure(data = data_1,layout=layout)
py.iplot(fig)
A large number of stores made revenue under 1 million in the year 2020.
# Top 10 stores by total revenue (used by the comparison charts below).
Sort_revenue = df_store_qty_price.sort_values("Revenue Total",ascending = False)
top_most = Sort_revenue[:10]
top_most
| Store id | STAR Dealer ? | Revenue Total | Region | Territory | Size | PI | EligMaj | EligConv | 2021 Customer Base | ... | Limited Warranties_qty | LOF_qty | Multicare_qty | New Majors Stated Time_qty | New Majors Unlimited_qty | Owner Care_qty | Short Term Lease_qty | Tire & Wheel_qty | Used Majors Stated Time_qty | Used Majors Unlimited_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1640 | 825498254 | 1 | 3891040 | Midwest | MW01 | E | 2.00 | 1879.0 | 1873.0 | 1770.0 | ... | 298.0 | 2712.0 | 31.0 | 747.0 | 22.0 | 3.0 | 428.0 | 359.0 | 1026.0 | 7.0 |
| 1021 | 894338943 | 1 | 3731565 | West | WE01 | E | 1.06 | 2892.0 | 176.0 | 3980.0 | ... | 0.0 | 985.0 | 0.0 | 2258.0 | 13.0 | 26.0 | 46.0 | 0.0 | 574.0 | 1.0 |
| 26 | 52410524 | 1 | 3729530 | California | CA01 | E | 2.14 | 1967.0 | 2238.0 | 1975.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 27 | 2410024 | 1 | 3521775 | California | CA02 | E | 1.46 | 2624.0 | 1214.0 | 3899.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2247 | 326673266 | 1 | 3466775 | Southeast | SE05 | E | 2.68 | 1570.0 | 2634.0 | 1626.0 | ... | 0.0 | 600.0 | 93.0 | 766.0 | 12.0 | 235.0 | 6.0 | 948.0 | 742.0 | 7.0 |
| 1868 | 56050560 | 1 | 3461420 | Great Lakes | GL07 | B | 1.16 | 1932.0 | 303.0 | 64.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1086 | 314393143 | 1 | 3381150 | West | WE01 | E | 1.47 | 1945.0 | 907.0 | 2414.0 | ... | 391.0 | 4148.0 | 4.0 | 1103.0 | 15.0 | 115.0 | 0.0 | 0.0 | 807.0 | 3.0 |
| 2373 | 486834868 | 1 | 3271855 | Mid Atlantic | MA05 | E | 1.04 | 2711.0 | 120.0 | 1962.0 | ... | 0.0 | 458.0 | 0.0 | 1721.0 | 6.0 | 1324.0 | 2.0 | 14.0 | 948.0 | 0.0 |
| 1277 | 274502745 | 0 | 3237280 | Mid Atlantic | MA04 | E | 1.93 | 1456.0 | 1360.0 | 1439.0 | ... | 79.0 | 971.0 | 32.0 | 908.0 | 13.0 | 488.0 | 1.0 | 197.0 | 519.0 | 2.0 |
| 1635 | 945419454 | 1 | 3149265 | Mid Atlantic | MA04 | E | 2.58 | 1359.0 | 2143.0 | 1302.0 | ... | 429.0 | 3349.0 | 1.0 | 698.0 | 9.0 | 176.0 | 1.0 | 814.0 | 637.0 | 1.0 |
10 rows × 58 columns
# Slice the series to compare for the top-10 stores. `customers` is also
# used by the customer-base overlay chart further below.
revenue_21 = top_most.loc[:, ["2021 Revenue ($)"]]
revenue_20 = top_most.loc[:, ["2020 Revenue ($)"]]
customers = top_most.loc[:, ["2021 Customer Base", "2020 Customer Base"]]
# Overlay 2021 (solid skyblue) and 2020 (translucent dark blue) revenue bars.
plot1 = pd.DataFrame(revenue_21.values).plot(
    figsize=(15,7), kind='bar', color='Skyblue')
pd.DataFrame(revenue_20.values).plot(
    ax=plot1,
    grid=True,color="darkblue",kind='bar', alpha=0.3)
plot1.set_xlabel('Store ID')
plot1.set_ylabel('Revenue made')
# Title fixed: values are in plain dollars (millions range), not billions,
# and this chart shows revenue only (customer base is plotted separately).
plot1.set_title("2020 V/S 2021 Revenue for top 10 stores by total revenue")
plot1.legend(['Revenue 2021', 'Revenue 2020'], loc = 'upper left')
# Scale the y-axis to the larger of the two years so neither year's bars
# are clipped (the original used only the 2020 maximum).
ymax = max(revenue_20.values.max(), revenue_21.values.max())
plot1.set_ylim([0, ymax + 1000])
plt.xticks( range(len(top_most['Store id'])), top_most['Store id'], rotation=45,color="red")
plt.show()
Comparing 2020 and 2021 revenue for the top 10 stores, revenue increased in 2021 relative to 2020 for almost all of them.
# 2021 revenue as a line with 2021/2020 customer-base bars on the SAME axis
# (secondary_y=False, so revenue and customer counts share one y scale).
plot1 = pd.DataFrame(revenue_21.values).plot(figsize=(15,7))
plot2 = customers.plot(ax = plot1, y=["2021 Customer Base","2020 Customer Base"], kind="bar",grid=True,secondary_y=False)
plot2.set_ylabel('Number of customers')
plot2.legend(['Customer base 2021','Customer base 2020'], loc='upper right')
plot2.set_ylim([0, 10000 ])
plt.xticks(
range(len(top_most['Store id'])), top_most['Store id'],
rotation=45
)
plt.show()
# Top 20 stores by Product Index (PI), with revenue and size for context.
Product_Index = df_store_qty_price.sort_values('PI',ascending=False)
Product_Index[['Store id','PI', 'Revenue Total', 'Size']].iloc[:20]
| Store id | PI | Revenue Total | Size | |
|---|---|---|---|---|
| 239 | 370263702 | 119.75 | 125450 | Unknown |
| 133 | 964579645 | 83.00 | 31500 | B |
| 2583 | 66940669 | 40.00 | 35830 | C |
| 2548 | 96930969 | 35.00 | 15140 | A |
| 2493 | 86900869 | 28.63 | 117475 | C |
| 587 | 902659026 | 28.60 | 128895 | B |
| 811 | 832708327 | 17.50 | 13005 | B |
| 2575 | 646936469 | 15.50 | 68895 | B |
| 1378 | 64540645 | 14.64 | 58270 | A |
| 864 | 563835638 | 13.47 | 564880 | E |
| 2105 | 126201262 | 13.10 | 361285 | E |
| 283 | 256082560 | 13.00 | 3995 | C |
| 234 | 86940869 | 12.72 | 228175 | D |
| 1780 | 276032760 | 12.13 | 198895 | C |
| 1221 | 824478244 | 12.00 | 1684 | A |
| 601 | 312673126 | 11.50 | 44020 | B |
| 1758 | 486024860 | 11.00 | 2574 | B |
| 2282 | 826728267 | 10.39 | 331160 | E |
| 1191 | 184461844 | 10.28 | 219311 | E |
| 2502 | 486904869 | 10.10 | 78635 | B |
# Top 20 stores by total revenue, keeping PI, size, and STAR flag.
revenue = df_store_qty_price.sort_values('Revenue Total',ascending=False)
revenue = revenue[['Store id', 'Revenue Total', 'PI', 'Size', 'STAR Dealer ?']].iloc[:20]
revenue
| Store id | Revenue Total | PI | Size | STAR Dealer ? | |
|---|---|---|---|---|---|
| 1640 | 825498254 | 3891040 | 2.00 | E | 1 |
| 1021 | 894338943 | 3731565 | 1.06 | E | 1 |
| 26 | 52410524 | 3729530 | 2.14 | E | 1 |
| 27 | 2410024 | 3521775 | 1.46 | E | 1 |
| 2247 | 326673266 | 3466775 | 2.68 | E | 1 |
| 1868 | 56050560 | 3461420 | 1.16 | B | 1 |
| 1086 | 314393143 | 3381150 | 1.47 | E | 1 |
| 2373 | 486834868 | 3271855 | 1.04 | E | 1 |
| 1277 | 274502745 | 3237280 | 1.93 | E | 0 |
| 1635 | 945419454 | 3149265 | 2.58 | E | 1 |
| 517 | 572365723 | 2867325 | 1.43 | E | 1 |
| 1274 | 204502045 | 2828250 | 1.00 | E | 0 |
| 35 | 556485564 | 2548888 | 2.93 | E | 0 |
| 1279 | 314503145 | 2522460 | 1.97 | E | 1 |
| 1341 | 864528645 | 2487800 | 2.39 | E | 1 |
| 49 | 814208142 | 2447995 | 2.08 | E | 1 |
| 2222 | 596615966 | 2424210 | 1.10 | B | 1 |
| 1723 | 156011560 | 2360065 | 1.31 | E | 1 |
| 636 | 232682326 | 2258700 | 1.01 | E | 0 |
| 1398 | 564545645 | 2224380 | 1.88 | E | 1 |
# Bar chart of the top-20 revenue stores, colored by customer-size bucket.
fig = plt.gcf()
fig.set_size_inches(10, 6)
ax = sns.barplot(data=revenue, x='Store id', y='Revenue Total', hue='Size')
ax.set_xticklabels(ax.get_xticklabels(),rotation = 90)
plt.title('Top 20 Stores based on annual revenue categorized by customer size')
plt.show()
Most of the top-revenue stores have a large customer base, except two stores (56050560 and 596615966), which have a small customer base.
# Re-sort by total revenue and keep the top 10 stores.
df_store_qty_price= df_store_qty_price.sort_values("Revenue Total",ascending = False)
top_10 =df_store_qty_price[:10]
top_10
# Reduce to the columns needed for the STAR/non-STAR comparison.
top_10 = top_10[['Store id','STAR Dealer ?', 'Revenue Total']]
top_10
| Store id | STAR Dealer ? | Revenue Total | |
|---|---|---|---|
| 1640 | 825498254 | 1 | 3891040 |
| 1021 | 894338943 | 1 | 3731565 |
| 26 | 52410524 | 1 | 3729530 |
| 27 | 2410024 | 1 | 3521775 |
| 2247 | 326673266 | 1 | 3466775 |
| 1868 | 56050560 | 1 | 3461420 |
| 1086 | 314393143 | 1 | 3381150 |
| 2373 | 486834868 | 1 | 3271855 |
| 1277 | 274502745 | 0 | 3237280 |
| 1635 | 945419454 | 1 | 3149265 |
# Bar chart of the top-10 revenue stores, colored by STAR-dealer flag.
fig = plt.gcf()
fig.set_size_inches(12, 8)
ax = sns.barplot(data=top_10, x='Store id', y='Revenue Total', hue='STAR Dealer ?')
ax.set_xticklabels(ax.get_xticklabels(),rotation = 90)
plt.title('Top 10 Stores based on annual revenue categorized by Star and non-star Dealer')
plt.show()
Found that the dealer for store id 56050560 attended the STAR program; based on this, the STAR program appears somewhat beneficial for improving revenue.
# Top 10 stores by revenue alongside total units sold (Grand Total_qty).
# NaN quantities appear for stores missing from the quantity sheet.
df_store_qty_price= df_store_qty_price.sort_values("Revenue Total",ascending = False)
top_10 =df_store_qty_price[:10]
top_10[['Store id', 'STAR Dealer ?', 'Revenue Total', 'Grand Total_qty']]
| Store id | STAR Dealer ? | Revenue Total | Grand Total_qty | |
|---|---|---|---|---|
| 1640 | 825498254 | 1 | 3891040 | 7852.0 |
| 1021 | 894338943 | 1 | 3731565 | 4618.0 |
| 26 | 52410524 | 1 | 3729530 | NaN |
| 27 | 2410024 | 1 | 3521775 | NaN |
| 2247 | 326673266 | 1 | 3466775 | 5717.0 |
| 1868 | 56050560 | 1 | 3461420 | NaN |
| 1086 | 314393143 | 1 | 3381150 | 8122.0 |
| 2373 | 486834868 | 1 | 3271855 | 5500.0 |
| 1277 | 274502745 | 0 | 3237280 | 4874.0 |
| 1635 | 945419454 | 1 | 3149265 | 8081.0 |
# Bar chart of units sold for the top-10 revenue stores.
top_10.plot(x ='Store id', y='Grand Total_qty', kind = 'bar',color="#bfef45", legend=False)
plt.ylabel("Products sold")
plt.xlabel('Store')
plt.title("Top 10 stores by products' quantity")
plt.show()
# Display the 10 stores with the highest Product Index, all columns.
df_store_qty_price.sort_values("PI", ascending = False).iloc[:10,:]
| Store id | STAR Dealer ? | Revenue Total | Region | Territory | Size | PI | EligMaj | EligConv | 2021 Customer Base | ... | Limited Warranties_qty | LOF_qty | Multicare_qty | New Majors Stated Time_qty | New Majors Unlimited_qty | Owner Care_qty | Short Term Lease_qty | Tire & Wheel_qty | Used Majors Stated Time_qty | Used Majors Unlimited_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 239 | 370263702 | 0 | 125450 | Mid Atlantic | MA04 | Unknown | 119.75 | 4.0 | 475.0 | 0.0 | ... | 0.0 | 0.0 | 74.0 | 2.0 | 0.0 | 0.0 | 0.0 | 376.0 | 2.0 | 0.0 |
| 133 | 964579645 | 0 | 31500 | Northeast | PNE1 | B | 83.00 | 1.0 | 82.0 | 311.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2583 | 66940669 | 0 | 35830 | Northeast | NEB1 | C | 40.00 | 2.0 | 78.0 | 21.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2548 | 96930969 | 0 | 15140 | Southeast | PSE1 | A | 35.00 | 1.0 | 34.0 | 89.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2493 | 86900869 | 0 | 117475 | Great Lakes | GL05 | C | 28.63 | 8.0 | 221.0 | 77.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 587 | 902659026 | 0 | 128895 | Northeast | NEB1 | B | 28.60 | 5.0 | 138.0 | 229.0 | ... | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 |
| 811 | 832708327 | 0 | 13005 | Mid Atlantic | PMA1 | B | 17.50 | 2.0 | 33.0 | 192.0 | ... | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2575 | 646936469 | 0 | 68895 | Northeast | PNE1 | B | 15.50 | 8.0 | 116.0 | 26.0 | ... | 0.0 | 1.0 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 112.0 | 7.0 | 1.0 |
| 1378 | 64540645 | 0 | 58270 | Midwest | PMW1 | A | 14.64 | 11.0 | 150.0 | 110.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 864 | 563835638 | 0 | 564880 | Denver | DN03 | E | 13.47 | 36.0 | 449.0 | 1181.0 | ... | 0.0 | 1059.0 | 0.0 | 18.0 | 3.0 | 0.0 | 0.0 | 0.0 | 4.0 | 2.0 |
10 rows × 58 columns
# Sort size-'A' stores by Product Index. (The original first assigned
# Sort_PI from the full frame and immediately overwrote it — dead code,
# removed.)
Size_A = df_store_qty_price[df_store_qty_price['Size'] == "A"]
Sort_PI = Size_A.sort_values("PI", ascending = False)
# Mean Product Index per customer-size bucket.
PI_Size = df_store_qty_price.groupby('Size').agg({'PI':'mean'})
PI_Size
| PI | |
|---|---|
| Size | |
| A | 1.184033 |
| B | 1.484414 |
| C | 1.438180 |
| D | 1.404734 |
| E | 1.553114 |
| Unknown | 6.521154 |
# Mean total revenue per customer-size bucket ('Unknown' rows included;
# the commented line shows how to exclude them).
#annual_size = df_store_qty_price[df_store_qty_price['Size']!= 'Unknown']
annual_size = df_store_qty_price.groupby('Size').agg({'Revenue Total':'mean'})
annual_size
| Revenue Total | |
|---|---|
| Size | |
| A | 36291.962963 |
| B | 102340.224464 |
| C | 170869.799250 |
| D | 321170.480676 |
| E | 618612.706587 |
| Unknown | 354346.730769 |
# Pie chart of mean revenue share by customer-size bucket.
colors = sns.color_palette('colorblind')[0:6]
plt.pie(annual_size['Revenue Total'], labels = annual_size.index, colors = colors, autopct='%.0f%%')
plt.title("Annual Revenue based on customer size")
plt.show()
As the number of customers visiting a store increases, its annual revenue also increases.
# Number of stores in each customer-size bucket.
store_size = df_store_qty_price.groupby('Size').agg({'Size':'count'})
store_size
| Size | |
|---|---|
| Size | |
| A | 486 |
| B | 793 |
| C | 533 |
| D | 414 |
| E | 334 |
| Unknown | 26 |
# Pie chart of store counts by customer-size bucket.
colors = sns.color_palette('colorblind')[0:6]
plt.pie(store_size['Size'], labels = store_size.index, colors = colors, autopct='%.0f%%')
plt.title("Number of stores based on customer size")
plt.show()
Most stores have a relatively small customer base (size B).
# Interactive donut chart of the Size distribution (plotly).
# labels: size categories ordered by frequency
lab = df_store_qty_price["Size"].value_counts().keys().tolist()
# values: store counts matching the labels above
val = df_store_qty_price["Size"].value_counts().values.tolist()
trace = go.Pie(labels = lab ,
values = val ,
marker = dict(colors = [ 'royalblue' ,'lime'],
line = dict(color = "white",
width = 1.3)
),
rotation = 90,
hoverinfo = "label+value+text",
hole = .5
)
layout = go.Layout(dict(title = "Size Small vs Large",
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
)
data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)
# Mean total revenue per region, displayed highest first.
Region_revenue = df_store_qty_price.groupby('Region')[['Revenue Total']].mean()
Region_revenue.sort_values('Revenue Total', ascending=False)
| Revenue Total | |
|---|---|
| Region | |
| California | 384341.900000 |
| West | 281842.597701 |
| Southwest | 236934.628571 |
| Mid Atlantic | 229776.635088 |
| Northeast | 204004.118380 |
| Southeast | 200960.845144 |
| Great Lakes | 190811.128834 |
| Midwest | 136869.097403 |
| Denver | 127644.883162 |
# Pie chart: each region's share of mean total revenue.
colors = sns.color_palette('bright')[0:9]
plt.pie(Region_revenue['Revenue Total'], labels = Region_revenue.index, colors = colors, autopct='%.0f%%')
# Title added for consistency — every other pie chart in this notebook has one.
plt.title("Mean revenue share by region")
plt.show()
California generated the highest average revenue, while Denver and the Midwest generated the least.
# Top 10 California stores by total revenue, with year-over-year revenue
# and customer-base columns for comparison.
California = df_store_qty_price[df_store_qty_price['Region'] == 'California']
California = California.sort_values('Revenue Total', ascending=False).head(10)
California[['Revenue Total', '2021 Revenue ($)', '2020 Revenue ($)',
            "2021 Customer Base", "2020 Customer Base"]]
| Revenue Total | 2021 Revenue ($) | 2020 Revenue ($) | 2021 Customer Base | 2020 Customer Base | |
|---|---|---|---|---|---|
| 26 | 3729530 | 3813833 | 3289122.0 | 1975.0 | 1826.0 |
| 27 | 3521775 | 3779185 | 5636386.0 | 3899.0 | 3827.0 |
| 35 | 2548888 | 2682038 | 2189248.0 | 1224.0 | 1318.0 |
| 49 | 2447995 | 2564775 | 2404226.0 | 1579.0 | 1702.0 |
| 50 | 1728260 | 1762230 | 1431255.0 | 1288.0 | 1249.0 |
| 61 | 1579130 | 1639210 | 1625898.0 | 789.0 | 777.0 |
| 62 | 1493845 | 1537910 | 992105.0 | 1174.0 | 949.0 |
| 66 | 1477755 | 1536365 | 1862122.0 | 1563.0 | 1500.0 |
| 94 | 1177365 | 1439155 | 1425236.0 | 3167.0 | 3387.0 |
| 102 | 1107975 | 1161470 | 1043027.0 | 601.0 | 632.0 |
# Overlayed bars: 2021 vs 2020 revenue for California's top-10 stores.
revenue_21 = California.loc[:, ["2021 Revenue ($)"]]
revenue_20 = California.loc[:, ["2020 Revenue ($)"]]
customers = California.loc[:, ["2021 Customer Base", "2020 Customer Base"]]
plot1 = pd.DataFrame(revenue_21.values).plot(
    figsize=(15,7), kind='bar', color='Skyblue')
pd.DataFrame(revenue_20.values).plot(
    ax=plot1,
    grid=True, color="darkblue", kind='bar', alpha=0.3)
plot1.set_xlabel('Store ID')
plot1.set_ylabel('Revenue made')
# Title fixed: the chart shows revenue only (not customer base), and the
# values are plain dollars, not billions.
plot1.set_title("California's 2020 V/S 2021 Revenue for top 10 stores by total revenue")
plot1.legend(['Revenue 2021', 'Revenue 2020'], loc = 'upper left')
# BUG FIX: the limit used only the 2020 max, which can clip 2021 bars.
plot1.set_ylim([0, max(revenue_20.values.max(), revenue_21.values.max()) + 1000])
# BUG FIX: `top_most` was undefined here (NameError) — label the ticks with
# California's own store ids.
plt.xticks(range(len(California['Store id'])), California['Store id'], rotation=45, color="red")
plt.show()
Revenue increased from 2020 to 2021 for most of California's top stores.
# Top 10 Denver stores by total revenue, with year-over-year revenue and
# customer-base columns for comparison.
Denver = df_store_qty_price[df_store_qty_price['Region'] == 'Denver']
Denver = Denver.sort_values('Revenue Total', ascending=False).head(10)
Denver[['Revenue Total', '2021 Revenue ($)', '2020 Revenue ($)',
        "2021 Customer Base", "2020 Customer Base"]]
| Revenue Total | 2021 Revenue ($) | 2020 Revenue ($) | 2021 Customer Base | 2020 Customer Base | |
|---|---|---|---|---|---|
| 1294 | 1371325 | 1394325 | 1533387.0 | 1215.0 | 1318.0 |
| 210 | 1315154 | 1336779 | 1228319.0 | 594.0 | 620.0 |
| 542 | 1122025 | 1141310 | 825917.0 | 988.0 | 1098.0 |
| 1800 | 810625 | 884880 | 1217566.0 | 1182.0 | 1015.0 |
| 1063 | 802670 | 850455 | 670382.0 | 802.0 | 854.0 |
| 1967 | 783110 | 798210 | 719925.0 | 943.0 | 867.0 |
| 2489 | 767120 | 816100 | 977149.0 | 949.0 | 945.0 |
| 1541 | 752085 | 752085 | 694360.0 | 213.0 | 295.0 |
| 1714 | 752065 | 773185 | 535300.0 | 618.0 | 626.0 |
| 544 | 727440 | 735725 | 370355.0 | 348.0 | 314.0 |
# Overlayed bars: 2021 vs 2020 revenue for Denver's top-10 stores.
revenue_21 = Denver.loc[:, ["2021 Revenue ($)"]]
revenue_20 = Denver.loc[:, ["2020 Revenue ($)"]]
customers = Denver.loc[:, ["2021 Customer Base", "2020 Customer Base"]]
plot1 = pd.DataFrame(revenue_21.values).plot(
    figsize=(15,7), kind='bar', color='Skyblue')
pd.DataFrame(revenue_20.values).plot(
    ax=plot1,
    grid=True, color="darkblue", kind='bar', alpha=0.3)
plot1.set_xlabel('Store ID')
plot1.set_ylabel('Revenue made')
# Title fixed: "Denever" typo, chart shows revenue only, values are dollars.
plot1.set_title("Denver's 2020 V/S 2021 Revenue for top 10 stores by total revenue")
plot1.legend(['Revenue 2021', 'Revenue 2020'], loc = 'upper right')
# BUG FIX: the limit used only the 2020 max, which can clip 2021 bars.
plot1.set_ylim([0, max(revenue_20.values.max(), revenue_21.values.max()) + 1000])
# BUG FIX: `top_most` was undefined here (NameError) — label the ticks with
# Denver's own store ids.
plt.xticks(range(len(Denver['Store id'])), Denver['Store id'], rotation=45, color="red")
plt.show()
# Per-store Gap unit price (revenue / quantity); NaN where either input
# is missing for that store.
UnitPrice = df_store_qty_price['Gap_price'].div(df_store_qty_price['Gap_qty'])
UnitPrice
1640 362.762747
1021 NaN
26 NaN
27 NaN
2247 378.098182
...
419 NaN
141 NaN
1945 NaN
355 NaN
311 NaN
Length: 2586, dtype: float64
# Bar chart of each numeric column's correlation with Revenue Total.
corr_with_revenue = Sales_df.corrwith(Sales_df['Revenue Total'])
corr_with_revenue.plot.bar(
    figsize=(20, 10),
    title='Correlation of variables with Total Revenue',
    fontsize=15, rot=45, grid=True, color="skyBlue",
)
<AxesSubplot:title={'center':'Correlation of variables with Total Revenue'}>
# Create a plot for 2021 Revenue and Customer Base
# FIX: passing legend_out directly is deprecated by seaborn (see the warning
# this cell emitted) — it must be forwarded through facet_kws.
facet = sns.lmplot(data = Sales_df, y='2021 Revenue ($)', x='2021 Customer Base', hue='Market type',
                   fit_reg=False, legend=True, facet_kws={'legend_out': True})
C:\Users\dhruv\anaconda3\envs\DAB300\lib\site-packages\seaborn\regression.py:592: UserWarning: legend_out is deprecated from the `lmplot` function signature. Please update your code to pass it using `facet_kws`.
A key part of a store's service offering is the number of technicians and service bays available in its region; the more there are, the easier it is for customers to get service, which improves their overall experience.
With the exception of a few exceptional circumstances, urban market outlets have the most technicians and service centres available (smaller stores not being able to leverage the market type benefits). With their larger customer base and income statistics, this could be one of the causes, i.e. ease of receiving services.
Stores in the ND market had the lowest count for both, which makes sense given their lacklustre performance over the past two years.
# Average products-per-customer (PEN) by market type, 2020 vs 2021.
df_store_qty_price = df_store_qty_price.rename(columns = {'2020 PEN' : 'PEN0','2021 PEN' : 'PEN1'})
pen20 = df_store_qty_price.groupby(['Market type'], as_index=False).PEN0.mean()
pen21 = df_store_qty_price.groupby(['Market type'], as_index=False).PEN1.mean()
pen = pd.merge(pen20, pen21, on='Market type', how='outer')
pen = pen.head(4)  # keep the four named market types
pen
| Market type | PEN0 | PEN1 | |
|---|---|---|---|
| 0 | Metro | 1.336751 | 1.378242 |
| 1 | ND | 0.803764 | 0.738794 |
| 2 | Rural | 0.851107 | 0.818327 |
| 3 | Secondary | 1.146954 | 1.224785 |
# Side-by-side bars: average PEN per market type, 2020 next to 2021.
pen.plot(x="Market type", y=["PEN0", "PEN1"], kind="bar")
plt.xlabel("Market Type")
plt.ylabel("Average products sold per customer")
plt.title("Average products sold per customer as per market type")
plt.legend(['Average 2020 PEN', 'Average 2021 PEN'],
           loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
The graph above compares the average number of products sold per customer in 2020 and 2021 for our four market types.
Because the performance of stores in metro markets is unmatched, the selling rate is higher in both years for the metro market type, as one would expect.
There is also a noticeable improvement from 2020 to 2021, which is a favourable observation.
Secondary market retailers are in second place, with a slightly lower PEN rate than metro, and a considerable improvement from 2020 to 2021.
# Schema / null-count overview of the per-store products-sold dataset.
products_sold_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2589 entries, 0 to 2588 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store id 2589 non-null object 1 Auto Appearance Care 222 non-null float64 2 CPOV Base Warranties 1531 non-null float64 3 Gap 735 non-null float64 4 Lease Wear & Tear 637 non-null float64 5 Limited Warranties 378 non-null float64 6 LOF 2208 non-null float64 7 Multicare 237 non-null float64 8 New Majors Stated Time 2018 non-null float64 9 New Majors Unlimited 914 non-null float64 10 Owner Care 844 non-null float64 11 Short Term Lease 372 non-null float64 12 Tire & Wheel 557 non-null float64 13 Used Majors Stated Time 1772 non-null float64 14 Used Majors Unlimited 648 non-null float64 15 Grand Total 2588 non-null float64 dtypes: float64(15), object(1) memory usage: 323.8+ KB
# Column-wise totals per product category. The first row ('Store id', a string
# concatenation) and the last ('Grand Total') are not product categories, so
# both are dropped.
attrib = products_sold_df.sum(axis=0, skipna=True)
df = attrib.reset_index()
df.columns = ['columns', 'sum-of']
df = df.drop(index=[df.index[0], df.index[15]])
df
| columns | sum-of | |
|---|---|---|
| 1 | Auto Appearance Care | 2398680.0 |
| 2 | CPOV Base Warranties | 253852080.0 |
| 3 | Gap | 47144008.0 |
| 4 | Lease Wear & Tear | 13553280.0 |
| 5 | Limited Warranties | 3968460.0 |
| 6 | LOF | 132639420.0 |
| 7 | Multicare | 2479010.0 |
| 8 | New Majors Stated Time | 329777162.0 |
| 9 | New Majors Unlimited | 21340442.0 |
| 10 | Owner Care | 47971376.0 |
| 11 | Short Term Lease | 5290904.0 |
| 12 | Tire & Wheel | 13251490.0 |
| 13 | Used Majors Stated Time | 221342860.0 |
| 14 | Used Majors Unlimited | 8594790.0 |
# Sanity check: column totals (the category names concatenate as one string).
df.sum(axis=0)
columns Auto Appearance CareCPOV Base WarrantiesGapLea... sum-of 1103603962.0 dtype: object
# Bar chart of total sales per product category.
df.plot(x ='columns', y='sum-of', kind = 'bar',color="skyblue")
plt.xlabel("product category")
plt.ylabel("total sales")
# BUG FIX: the old title ("Top 10 stores by grand total") described a different
# chart — this one shows category totals, not stores.
plt.title("Total sales by product category")
plt.show()
From the above plot we can see that "CPOV Base Warranties", "New Majors Stated Time" and "Used Majors Stated Time" bring in the most revenue.
# Display the current top-10 stores table (top_10 is defined in a later/earlier
# cell of this notebook; execution order is out of sequence here).
top_10
| Store id | STAR Dealer ? | Revenue Total | Region | Territory | Size | PI | EligMaj | EligConv | 2021 Customer Base | ... | Limited Warranties_qty | LOF_qty | Multicare_qty | New Majors Stated Time_qty | New Majors Unlimited_qty | Owner Care_qty | Short Term Lease_qty | Tire & Wheel_qty | Used Majors Stated Time_qty | Used Majors Unlimited_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1640 | 825498254 | 1 | 3891040 | Midwest | MW01 | E | 2.00 | 1879.0 | 1873.0 | 1770.0 | ... | 298.0 | 2712.0 | 31.0 | 747.0 | 22.0 | 3.0 | 428.0 | 359.0 | 1026.0 | 7.0 |
| 1021 | 894338943 | 1 | 3731565 | West | WE01 | E | 1.06 | 2892.0 | 176.0 | 3980.0 | ... | 0.0 | 985.0 | 0.0 | 2258.0 | 13.0 | 26.0 | 46.0 | 0.0 | 574.0 | 1.0 |
| 26 | 52410524 | 1 | 3729530 | California | CA01 | E | 2.14 | 1967.0 | 2238.0 | 1975.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 27 | 2410024 | 1 | 3521775 | California | CA02 | E | 1.46 | 2624.0 | 1214.0 | 3899.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2247 | 326673266 | 1 | 3466775 | Southeast | SE05 | E | 2.68 | 1570.0 | 2634.0 | 1626.0 | ... | 0.0 | 600.0 | 93.0 | 766.0 | 12.0 | 235.0 | 6.0 | 948.0 | 742.0 | 7.0 |
| 1868 | 56050560 | 1 | 3461420 | Great Lakes | GL07 | B | 1.16 | 1932.0 | 303.0 | 64.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1086 | 314393143 | 1 | 3381150 | West | WE01 | E | 1.47 | 1945.0 | 907.0 | 2414.0 | ... | 391.0 | 4148.0 | 4.0 | 1103.0 | 15.0 | 115.0 | 0.0 | 0.0 | 807.0 | 3.0 |
| 2373 | 486834868 | 1 | 3271855 | Mid Atlantic | MA05 | E | 1.04 | 2711.0 | 120.0 | 1962.0 | ... | 0.0 | 458.0 | 0.0 | 1721.0 | 6.0 | 1324.0 | 2.0 | 14.0 | 948.0 | 0.0 |
| 1277 | 274502745 | 0 | 3237280 | Mid Atlantic | MA04 | E | 1.93 | 1456.0 | 1360.0 | 1439.0 | ... | 79.0 | 971.0 | 32.0 | 908.0 | 13.0 | 488.0 | 1.0 | 197.0 | 519.0 | 2.0 |
| 1635 | 945419454 | 1 | 3149265 | Mid Atlantic | MA04 | E | 2.58 | 1359.0 | 2143.0 | 1302.0 | ... | 429.0 | 3349.0 | 1.0 | 698.0 | 9.0 | 176.0 | 1.0 | 814.0 | 637.0 | 1.0 |
10 rows × 58 columns
# Rank stores by Grand Total and keep the top ten; the very first row is
# dropped — presumably an aggregate/outlier row, TODO confirm against the data.
ranked = products_sold_df.sort_values("Grand Total", ascending=False)
top_10 = ranked[:10]
top_10 = top_10.drop([top_10.index[0]])
top_10
# Bar chart of "New Majors Stated Time" sales for those stores.
top_10.plot(x='Store id', y='New Majors Stated Time', kind='bar', color="green")
plt.xlabel('Store')
plt.ylabel("New Majors Stated Time")
plt.title("New Majors stated time sales by top 10 stores")
plt.show()
# Re-derive the top-10 table (duplicate of the cell above); the first row is
# again dropped — presumably an aggregate/outlier row, TODO confirm.
by_total = products_sold_df.sort_values("Grand Total", ascending=False)
top_10 = by_total[:10]
top_10 = top_10.drop([top_10.index[0]])
top_10
| Store id | Auto Appearance Care | CPOV Base Warranties | Gap | Lease Wear & Tear | Limited Warranties | LOF | Multicare | New Majors Stated Time | New Majors Unlimited | Owner Care | Short Term Lease | Tire & Wheel | Used Majors Stated Time | Used Majors Unlimited | Grand Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1751 | 736057360 | 1145.0 | 122225.0 | 36575.0 | 1950.0 | 120.0 | 39740.0 | NaN | 10602583.0 | 646190.0 | 860.0 | 25990.0 | 56645.0 | 1500865.0 | 105560.0 | 13140448.0 |
| 1459 | 825498254 | 2970.0 | 506775.0 | 348615.0 | 56530.0 | 77200.0 | 344910.0 | 12325.0 | 960405.0 | 58590.0 | 765.0 | 82750.0 | 80275.0 | 1339665.0 | 19265.0 | 3891040.0 |
| 755 | 894338943 | NaN | 254275.0 | NaN | 48195.0 | NaN | 226290.0 | NaN | 2417275.0 | 32970.0 | 20665.0 | 12480.0 | NaN | 716220.0 | 3195.0 | 3731565.0 |
| 118 | 052410524 | 1180.0 | 301300.0 | 650105.0 | 220745.0 | 18465.0 | 75615.0 | 11715.0 | 1521235.0 | 84645.0 | 13130.0 | 1640.0 | 23035.0 | 790835.0 | 15885.0 | 3729530.0 |
| 117 | 002410024 | NaN | NaN | 432190.0 | 2550.0 | NaN | 360220.0 | NaN | 1517635.0 | 27895.0 | 416465.0 | 24510.0 | 1105.0 | 727605.0 | 11600.0 | 3521775.0 |
| 2184 | 326673266 | 49060.0 | 316100.0 | 519885.0 | 28685.0 | NaN | 266685.0 | 31065.0 | 862255.0 | 25150.0 | 185915.0 | 2325.0 | 232790.0 | 935105.0 | 11755.0 | 3466775.0 |
| 1706 | 056050560 | 37290.0 | NaN | NaN | -375.0 | NaN | 3965.0 | NaN | 2561140.0 | 273980.0 | NaN | NaN | 56325.0 | 496345.0 | 32750.0 | 3461420.0 |
| 822 | 314393143 | 1000.0 | 276050.0 | 317295.0 | 21750.0 | 65445.0 | 540830.0 | 1890.0 | 1167190.0 | 36095.0 | 101415.0 | NaN | NaN | 842865.0 | 9325.0 | 3381150.0 |
| 2326 | 486834868 | NaN | 402200.0 | NaN | 84455.0 | NaN | 165340.0 | NaN | 1022305.0 | 9935.0 | 691685.0 | 860.0 | 3260.0 | 891815.0 | NaN | 3271855.0 |
# Bar chart: Grand Total (overall product sales) for the top-10 stores.
top_10.plot(x ='Store id', y='Grand Total', kind = 'bar',color="#911eb4")
plt.ylabel("Products sold")
plt.xlabel('Store')
plt.title("Top 10 stores by products sold")  # fixed stray trailing quote
plt.show()
From the above plot we can see that store ID 736057360 has the highest grand total, meaning it has the highest sales and profit. We can therefore treat it as a baseline store: comparing other stores against it can reveal shortcomings and opportunities to increase their revenue.
# Ten worst-performing stores by Grand Total, lowest first.
# (Local renamed: the old name `ascending` shadowed the sort kwarg.)
by_total_asc = products_sold_df.sort_values("Grand Total", ascending=True)
bottom_10 = by_total_asc[:10]
bottom_10
| Store id | Auto Appearance Care | CPOV Base Warranties | Gap | Lease Wear & Tear | Limited Warranties | LOF | Multicare | New Majors Stated Time | New Majors Unlimited | Owner Care | Short Term Lease | Tire & Wheel | Used Majors Stated Time | Used Majors Unlimited | Grand Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 272 | 882688826 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1315.0 | NaN | NaN | NaN | NaN | NaN | NaN | -1315.0 |
| 1144 | 264542645 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -225.0 | NaN | NaN | NaN | NaN | NaN | -225.0 |
| 787 | 404364043 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 714 | 264302643 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 1122 | 694536945 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 400 | 702707027 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 2203 | 336693366 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 1404 | 054580545 | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 54 | 282302823 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
| 1599 | 006030060 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
# Bar chart: Grand Total for the bottom-10 stores.
bottom_10.plot(x ='Store id', y='Grand Total', kind = 'bar',color="#db67aa")
plt.ylabel("Products sold")
plt.xlabel('Store')
plt.title("Bottom 10 stores by products sold")  # fixed stray comma and quote
plt.show()
From the above plot we can see that store ids 882688826 and 264542645 are performing worst, and we need to focus on them.
# Pairwise Pearson correlations of the numeric columns.
correlation = Sales_df.corr()
matrix_cols = correlation.columns.tolist()  # axis tick labels for the heatmap
corr_array = correlation.to_numpy()         # plain array form for plotly
correlation
| STAR Dealer ? | Revenue Total | PI | EligMaj | EligConv | 2021 Customer Base | 2021 Products (Qty) | 2021 Revenue ($) | 2021 PEN | 2021 $PEN | 2020 Revenue ($) | 2020 Contracts ($) | 2020 Customer Base | 2020 PEN | 2020 $PEN | 10 Year VIOs | Trade Zone Retention | Tech Counts | Stall Counts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STAR Dealer ? | 1.000000 | 0.424537 | 0.021449 | 0.427808 | 0.296347 | 0.083395 | 0.332474 | 0.421117 | 0.328492 | 0.280782 | 0.387520 | 0.305729 | 0.090347 | 0.333781 | 0.361028 | 0.142918 | 0.098181 | 0.153338 | 0.148684 |
| Revenue Total | 0.424537 | 1.000000 | 0.026137 | 0.922602 | 0.687464 | 0.503606 | 0.891798 | 0.998633 | 0.475966 | 0.398324 | 0.935411 | 0.835242 | 0.510376 | 0.448595 | 0.452708 | 0.433319 | 0.128977 | 0.484695 | 0.403426 |
| PI | 0.021449 | 0.026137 | 1.000000 | 0.011926 | 0.122641 | -0.032504 | 0.034145 | 0.024571 | 0.069605 | 0.033541 | 0.021826 | 0.027847 | -0.038047 | 0.058847 | 0.031381 | -0.023541 | -0.072652 | -0.032095 | -0.035594 |
| EligMaj | 0.427808 | 0.922602 | 0.011926 | 1.000000 | 0.661996 | 0.377307 | 0.745803 | 0.916632 | 0.397476 | 0.377220 | 0.874923 | 0.704273 | 0.393343 | 0.368556 | 0.431136 | 0.289364 | 0.094066 | 0.340280 | 0.301388 |
| EligConv | 0.296347 | 0.687464 | 0.122641 | 0.661996 | 1.000000 | 0.175899 | 0.682402 | 0.682609 | 0.366857 | 0.218699 | 0.635386 | 0.617859 | 0.180811 | 0.379718 | 0.293442 | 0.236842 | 0.040947 | 0.213013 | 0.214441 |
| 2021 Customer Base | 0.083395 | 0.503606 | -0.032504 | 0.377307 | 0.175899 | 1.000000 | 0.553390 | 0.527825 | -0.020124 | -0.065299 | 0.519615 | 0.542625 | 0.944097 | -0.005578 | -0.053145 | 0.630313 | 0.146159 | 0.677717 | 0.538396 |
| 2021 Products (Qty) | 0.332474 | 0.891798 | 0.034145 | 0.745803 | 0.682402 | 0.553390 | 1.000000 | 0.898525 | 0.478058 | 0.238445 | 0.852518 | 0.933931 | 0.558400 | 0.489364 | 0.311457 | 0.537111 | 0.154962 | 0.575362 | 0.480261 |
| 2021 Revenue ($) | 0.421117 | 0.998633 | 0.024571 | 0.916632 | 0.682609 | 0.527825 | 0.898525 | 1.000000 | 0.469142 | 0.388342 | 0.939608 | 0.844937 | 0.538308 | 0.444365 | 0.443003 | 0.456907 | 0.136731 | 0.508459 | 0.424722 |
| 2021 PEN | 0.328492 | 0.475966 | 0.069605 | 0.397476 | 0.366857 | -0.020124 | 0.478058 | 0.469142 | 1.000000 | 0.808301 | 0.412016 | 0.430486 | 0.001739 | 0.827777 | 0.714418 | 0.123099 | 0.045870 | 0.132257 | 0.109322 |
| 2021 $PEN | 0.280782 | 0.398324 | 0.033541 | 0.377220 | 0.218699 | -0.065299 | 0.238445 | 0.388342 | 0.808301 | 1.000000 | 0.324363 | 0.205879 | -0.047385 | 0.551680 | 0.801734 | 0.008597 | -0.026579 | 0.017209 | 0.001227 |
| 2020 Revenue ($) | 0.387520 | 0.935411 | 0.021826 | 0.874923 | 0.635386 | 0.519615 | 0.852518 | 0.939608 | 0.412016 | 0.324363 | 1.000000 | 0.902747 | 0.562473 | 0.457235 | 0.437863 | 0.463559 | 0.174805 | 0.535483 | 0.456540 |
| 2020 Contracts ($) | 0.305729 | 0.835242 | 0.027847 | 0.704273 | 0.617859 | 0.542625 | 0.933931 | 0.844937 | 0.430486 | 0.205879 | 0.902747 | 1.000000 | 0.586767 | 0.527111 | 0.318862 | 0.551868 | 0.197528 | 0.604742 | 0.510338 |
| 2020 Customer Base | 0.090347 | 0.510376 | -0.038047 | 0.393343 | 0.180811 | 0.944097 | 0.558400 | 0.538308 | 0.001739 | -0.047385 | 0.562473 | 0.586767 | 1.000000 | 0.019980 | -0.032514 | 0.703460 | 0.236054 | 0.753340 | 0.619627 |
| 2020 PEN | 0.333781 | 0.448595 | 0.058847 | 0.368556 | 0.379718 | -0.005578 | 0.489364 | 0.444365 | 0.827777 | 0.551680 | 0.457235 | 0.527111 | 0.019980 | 1.000000 | 0.743549 | 0.171405 | 0.136818 | 0.186256 | 0.164097 |
| 2020 $PEN | 0.361028 | 0.452708 | 0.031381 | 0.431136 | 0.293442 | -0.053145 | 0.311457 | 0.443003 | 0.714418 | 0.801734 | 0.437863 | 0.318862 | -0.032514 | 0.743549 | 1.000000 | 0.048472 | 0.037430 | 0.064652 | 0.052707 |
| 10 Year VIOs | 0.142918 | 0.433319 | -0.023541 | 0.289364 | 0.236842 | 0.630313 | 0.537111 | 0.456907 | 0.123099 | 0.008597 | 0.463559 | 0.551868 | 0.703460 | 0.171405 | 0.048472 | 1.000000 | 0.239088 | 0.796353 | 0.724269 |
| Trade Zone Retention | 0.098181 | 0.128977 | -0.072652 | 0.094066 | 0.040947 | 0.146159 | 0.154962 | 0.136731 | 0.045870 | -0.026579 | 0.174805 | 0.197528 | 0.236054 | 0.136818 | 0.037430 | 0.239088 | 1.000000 | 0.383827 | 0.351931 |
| Tech Counts | 0.153338 | 0.484695 | -0.032095 | 0.340280 | 0.213013 | 0.677717 | 0.575362 | 0.508459 | 0.132257 | 0.017209 | 0.535483 | 0.604742 | 0.753340 | 0.186256 | 0.064652 | 0.796353 | 0.383827 | 1.000000 | 0.714294 |
| Stall Counts | 0.148684 | 0.403426 | -0.035594 | 0.301388 | 0.214441 | 0.538396 | 0.480261 | 0.424722 | 0.109322 | 0.001227 | 0.456540 | 0.510338 | 0.619627 | 0.164097 | 0.052707 | 0.724269 | 0.351931 | 0.714294 | 1.000000 |
# Render the correlation matrix as an interactive plotly heatmap.
heat = go.Heatmap(
    z=corr_array,
    x=matrix_cols,
    y=matrix_cols,
    colorscale="Viridis",
    colorbar=dict(title="Pearson Correlation coefficient",
                  titleside="right"),
)
fig_layout = go.Layout(
    dict(
        title="Correlation Matrix for variables",
        autosize=False,
        height=720,
        width=800,
        margin=dict(r=0, l=210, t=25, b=210),
        yaxis=dict(tickfont=dict(size=9)),
        xaxis=dict(tickfont=dict(size=9)),
    )
)
fig = go.Figure(data=[heat], layout=fig_layout)
py.iplot(fig)
# Drop categorical/identifier columns and features that leak the target
# (2021 Revenue is essentially Revenue Total — see the 0.9986 correlation).
df_new = Sales_df.drop(columns=['Region','Territory','Size','City','State','Market type','Segmentation','Service Absorption %', '2021 Revenue ($)', '2020 Revenue ($)', '2021 $PEN','2021 Products (Qty)'])
from sklearn.model_selection import train_test_split
target_var = 'Revenue Total'
# Everything except the id and the target is a predictor.
all_features = [x for x in df_new.columns if x not in ['Store id', target_var]]
# FIX: random_state added so the split — and every metric printed below —
# is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_new[all_features],
    df_new[target_var],
    test_size=0.2,
    random_state=42,
)
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit of Revenue Total on the remaining features.
reg_fit = LinearRegression()
reg_fit.fit(x_train, y_train)
LinearRegression()
# Fitted intercept: predicted Revenue Total when every feature is zero.
reg_fit.intercept_
-22903.374580512114
# Fitted coefficients, one per entry of all_features (same order).
reg_fit.coef_
array([ 2.07935830e+04, 3.37021871e+02, 9.53610511e+02, 9.17808920e+01,
1.99860240e+02, 3.19645096e+04, 1.79273532e+02, -1.50622612e+02,
-4.82624010e+04, 4.57582450e+01, 3.34915792e+00, 2.28329173e+04,
1.44531898e+03, -6.38979321e+02])
# Predictions on the training and held-out sets, used by the metrics below.
train_preds = reg_fit.predict(x_train)
test_preds = reg_fit.predict(x_test)
from sklearn.metrics import median_absolute_error
# BUG FIX: the old labels said "MSE", but the metric computed is the MEDIAN
# ABSOLUTE ERROR (same units as revenue, robust to outliers) — label it so.
print('In-Sample Median Absolute Error: %0.4f' % median_absolute_error(y_true=y_train, y_pred=train_preds))
print('Out-of-Sample Median Absolute Error: %0.4f' % median_absolute_error(y_true=y_test, y_pred=test_preds))
In-Sample MSE: 24046.4064 Out-of-Sample MSE: 22517.0573
from sklearn.metrics import r2_score
# R-squared on the training data versus the held-out split.
r2_train = r2_score(y_true=y_train, y_pred=train_preds)
r2_test = r2_score(y_true=y_test, y_pred=test_preds)
print('In-Sample R-Squared: %0.4f' % r2_train)
print('Out-of-Sample R-Squared: %0.4f' % r2_test)
In-Sample R-Squared: 0.9410 Out-of-Sample R-Squared: 0.9403
# Scatter plot of predicted versus actual out-of-sample values.
plt.scatter(y_test, test_preds)
# BUG FIX: the reference line must be the identity line y = x. The old code
# drew from (0, max(actual)) to (0, max(predicted)), which is NOT the parity
# line unless the two maxima coincide.
lim = max(max(y_test), max(test_preds))
plt.plot([0, lim], [0, lim], color='gray', lw=1, linestyle='--')
plt.xlabel('actual')
plt.ylabel('predicted')
plt.title('Out-of-Sample Actual vs. Predicted')
plt.grid()
plt.show()
After removing strongly correlated variables such as 2021 Revenue, 2020 Revenue, 2021 PEN and 2021 Products (Qty), the model achieves an out-of-sample R-squared of about 0.94, which is quite good.